Code
##| echo: true #This can be added to selectively show specific code chunks.
# Load dataset
data_path <- file.path(dirname(dirname(here())), "Data", "fraud.csv")
df <- fread(data_path)

Data Understanding and Exploration
This document presents an initial exploratory analysis of the data set related to self-checkout fraud detection. The focus is on understanding the structure of the fraud.csv data set before proceeding with further analysis.
##| echo: true #This can be added to selectively show specific code chunks.
# Load dataset
data_path <- file.path(dirname(dirname(here())), "Data", "fraud.csv")
df <- fread(data_path)

# Dimensions of the dataset
cat("The dataset contains", num_rows, "rows and", num_cols, "columns.\n")

The dataset contains 498121 rows and 10 columns.
skim(df)

| Name | df |
|---|---|
| Number of rows | 498121 |
| Number of columns | 10 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| trustLevel | 0 | 1 | 3.50 | 1.71 | 1 | 2.00 | 4.00 | 5.00 | 6.00 | ▇▅▅▅▅ |
| totalScanTimeInSeconds | 0 | 1 | 915.61 | 528.77 | 1 | 458.00 | 916.00 | 1374.00 | 1831.00 | ▇▇▇▇▇ |
| grandTotal | 0 | 1 | 49.99 | 28.87 | 0 | 24.93 | 50.03 | 75.02 | 99.99 | ▇▇▇▇▇ |
| lineItemVoids | 0 | 1 | 5.50 | 3.45 | 0 | 3.00 | 5.00 | 8.00 | 11.00 | ▇▆▆▅▇ |
| scansWithoutRegistration | 0 | 1 | 5.00 | 3.16 | 0 | 2.00 | 5.00 | 8.00 | 10.00 | ▇▅▅▅▆ |
| quantityModifications | 0 | 1 | 2.50 | 1.71 | 0 | 1.00 | 2.00 | 4.00 | 5.00 | ▇▃▃▅▃ |
| scannedLineItemsPerSecond | 0 | 1 | 0.07 | 0.52 | 0 | 0.01 | 0.02 | 0.03 | 30.00 | ▇▁▁▁▁ |
| valuePerSecond | 0 | 1 | 0.22 | 1.72 | 0 | 0.03 | 0.05 | 0.11 | 99.71 | ▇▁▁▁▁ |
| lineItemVoidsPerPosition | 0 | 1 | 0.74 | 1.32 | 0 | 0.16 | 0.35 | 0.69 | 11.00 | ▇▁▁▁▁ |
| fraud | 0 | 1 | 0.05 | 0.21 | 0 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
# Print every column name next to its class, one column per line.
# vapply() with a character(1) template always yields a character vector;
# sapply() would return a list as soon as one column carries several classes
# (e.g. c("POSIXct", "POSIXt")), which paste() would then deparse into the
# output. Only the primary (first) class of each column is reported.
cat(
  paste(
    names(df),
    "➡️",
    vapply(df, function(col) class(col)[1L], character(1))
  ),
  sep = "\n"
)

trustLevel ➡️ integer
totalScanTimeInSeconds ➡️ integer
grandTotal ➡️ numeric
lineItemVoids ➡️ integer
scansWithoutRegistration ➡️ integer
quantityModifications ➡️ integer
scannedLineItemsPerSecond ➡️ numeric
valuePerSecond ➡️ numeric
lineItemVoidsPerPosition ➡️ numeric
fraud ➡️ integer
cat("The dataset contains", sum(missing_values), "missing values and ", duplicate_count, " duplicates.\n")

The dataset contains 0 missing values and 0 duplicates.
# Correlation matrix over all columns of df.
# use = "complete.obs" restricts the computation to rows without missing
# values, should any be present.
cor_matrix <- cor(x = df, use = "complete.obs")
# print(cor_matrix)

# Visualization using corrplot: colour-coded cells with the coefficient
# printed in white on top of each cell.
corrplot(
  corr = cor_matrix,
  method = "color",
  col = COL1("YlOrRd"),
  tl.col = "grey30",
  tl.srt = 45,
  tl.cex = 0.675,
  addCoef.col = "white",
  number.cex = 0.8,
  addgrid.col = "white"
)
fraud

# Create the bar chart with count labels
t_hist <- ggplot(data = df, mapping = aes(x = factor(fraud))) +
  # One bar per fraud value; the bar height is the number of rows
  geom_bar(fill = "steelblue", alpha = 0.7) +
  # Label each bar with the count computed by the "count" statistic
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    size = 5
  ) +
  labs(
    title = "Distribution of Variable \"Fraud\"",
    x = "Fraud",
    y = "Count"
  ) +
  theme_minimal()

# Convert the ggplot object to a plotly widget and display it
ggplotly(t_hist)

trustLevel

# Create the stacked bar chart
t_hist <- ggplot(
  data = df,
  mapping = aes(x = factor(trustLevel), fill = factor(fraud))
) +
  # Bars are stacked so each trust level shows its fraud / non-fraud split
  geom_bar(position = "stack", alpha = 0.7) +
  labs(
    title = "Trust Level Distribution with Fraud Breakdown",
    x = "Trust Level",
    y = "Count",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(t_hist)

totalScanTimeInSeconds with Fraud Breakdown

# Boxplot for totalScanTimeInSeconds by fraud status
boxplot_scan_time <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = totalScanTimeInSeconds,
    fill = factor(fraud)
  )
) +
  # Outlier points are highlighted in red
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  labs(
    title = "Total Scan Time by Fraud Status",
    x = "Fraud Status",
    y = "Total Scan Time (Seconds)",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(boxplot_scan_time)

# Density plot for totalScanTimeInSeconds by fraud status
density_scan_time <- ggplot(
  data = df,
  mapping = aes(x = totalScanTimeInSeconds, fill = factor(fraud))
) +
  # Semi-transparent fill keeps both overlapping densities visible
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Plot of Total Scan Time by Fraud Status",
    x = "Total Scan Time (Seconds)",
    y = "Density",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(density_scan_time)

grandTotal with Fraud Breakdown

# Boxplot for grandTotal by fraud status
boxplot_grand_total <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = grandTotal,
    fill = factor(fraud)
  )
) +
  # Outlier points are highlighted in red
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  labs(
    title = "Grand Total by Fraud Status",
    x = "Fraud Status",
    y = "Transaction Amount (Grand Total)",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(boxplot_grand_total)

# Stacked Histogram of grandTotal by fraud status
hist_grand_total <- ggplot(
  data = df,
  mapping = aes(x = grandTotal, fill = factor(fraud))
) +
  # 30 bins, stacked by fraud status, with black bar outlines
  geom_histogram(
    bins = 30,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of Grand Total with Fraud Breakdown",
    x = "Transaction Amount (Grand Total)",
    y = "Count",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(hist_grand_total)

lineItemVoids with Fraud Breakdown

# Boxplot for lineItemVoids by fraud status
boxplot_lineItemVoids <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = lineItemVoids,
    fill = factor(fraud)
  )
) +
  # Outlier points are highlighted in red
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  labs(
    title = "lineItemVoids by Fraud Status",
    x = "Fraud Status",
    y = "lineItemVoids",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(boxplot_lineItemVoids)

# Stacked Histogram of lineItemVoids by fraud status
hist_lineItemVoids <- ggplot(
  data = df,
  mapping = aes(x = lineItemVoids, fill = factor(fraud))
) +
  # lineItemVoids holds integer counts, so use unit-wide bins centred on the
  # integers (one bar per value). A fixed `bins = 10` produces non-integer
  # bin widths, which merges neighbouring values into a single bar and
  # inflates its height relative to the others.
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of lineItemVoids with Fraud Breakdown",
    x = "lineItemVoids",
    y = "Count",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(hist_lineItemVoids)

scansWithoutRegistration with Fraud Breakdown

# Boxplot for scansWithoutRegistration by fraud status
boxplot_scansWithoutRegistration <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = scansWithoutRegistration,
    fill = factor(fraud)
  )
) +
  # Outlier points are highlighted in red
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  labs(
    title = "scansWithoutRegistration by Fraud Status",
    x = "Fraud Status",
    y = "scansWithoutRegistration",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(boxplot_scansWithoutRegistration)

# Stacked Histogram of scansWithoutRegistration by Fraud Status
hist_scansWithoutRegistration <- ggplot(
  data = df,
  mapping = aes(x = scansWithoutRegistration, fill = factor(fraud))
) +
  # scansWithoutRegistration holds integer counts, so use unit-wide bins
  # centred on the integers (one bar per value). A fixed `bins = 10` produces
  # non-integer bin widths, which merges two neighbouring values into a
  # single bar and inflates its height relative to the others.
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of scansWithoutRegistration with Fraud Breakdown",
    x = "scansWithoutRegistration",
    y = "Count",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(hist_scansWithoutRegistration)

quantityModifications with Fraud Breakdown

# Boxplot for quantityModifications by fraud status
boxplot_quantityModifications <- ggplot(
  data = df,
  mapping = aes(
    x = factor(fraud),
    y = quantityModifications,
    fill = factor(fraud)
  )
) +
  # Outlier points are highlighted in red
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  labs(
    title = "quantityModifications by Fraud Status",
    x = "Fraud Status",
    y = "quantityModifications",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(boxplot_quantityModifications)

# Stacked Histogram of quantityModifications by Fraud Status
hist_quantityModifications <- ggplot(
  data = df,
  mapping = aes(x = quantityModifications, fill = factor(fraud))
) +
  # quantityModifications holds integer counts, so use unit-wide bins centred
  # on the integers (one bar per value). A fixed `bins = 5` produces
  # non-integer bin widths, which merges two neighbouring values into a
  # single bar and inflates its height relative to the others.
  geom_histogram(
    binwidth = 1,
    center = 0,
    color = "black",
    alpha = 0.7,
    position = "stack"
  ) +
  labs(
    title = "Distribution of quantityModifications with Fraud Breakdown",
    x = "quantityModifications",
    y = "Count",
    fill = "Fraud Status"
  ) +
  # Fixed palette keyed by the factor levels "0" and "1"
  scale_fill_manual(values = c("0" = "steelblue", "1" = "red")) +
  theme_minimal()

# Interactive version
ggplotly(hist_quantityModifications)